INTRODUCTION

# Load the Season 13 champion statistics. The file's own header row is skipped
# and replaced by the explicit column names below.
leaguedf <- read_csv(
  "../data_sets/S13LeagueOfLegendsData.csv",
  # readr documents `col_types` as ONE compact string with one letter per
  # column (c = character, d = double), not a vector of single letters.
  col_types = "cccccdddddddc",
  col_names = c(
    "rowno", "Name", "Class", "Role", "Tier", "Score", "Trend",
    "WinRate", "RoleRate", "PickRate", "BanRate", "KDA", "Patch"
  ),
  skip = 1
) %>%
  column_to_rownames("rowno") %>%
  mutate(
    PickBanRate = PickRate + BanRate,
    # The lazy pattern removes everything up to and including the first
    # underscore; what remains is converted to a number.
    Patch = as.numeric(str_replace(Patch, '(.*?)_(.*?)', '')),
    Role = str_to_title(Role)
  )

# Put the tiers in this explicit order instead of alphabetical order.
leaguedf$Tier <- as.factor(leaguedf$Tier) %>%
  fct_relevel(c("God", "S", "A", "B", "C", "D"))

head(leaguedf, 5)
##     Name    Class Role Tier Score  Trend WinRate RoleRate PickRate BanRate  KDA
## 1 Aatrox  Fighter  Top    S 57.63 -31.86  0.4768   0.9163   0.0662  0.1198 1.77
## 2   Ahri     Mage  Mid    S 57.18   4.55  0.4950   0.9465   0.0581  0.0173 2.58
## 3  Akali Assassin  Mid    S 65.49   4.33  0.4841   0.7574   0.0811  0.1302 2.37
## 4  Akali Assassin  Top    C 39.63  -1.51  0.4592   0.2350   0.0255  0.1302 2.00
## 5 Akshan Marksman  Mid    A 49.39   0.34  0.5162   0.6603   0.0275  0.0379 2.26
##   Patch PickBanRate
## 1     1      0.1860
## 2     1      0.0754
## 3     1      0.2113
## 4     1      0.1557
## 5     1      0.0654

AFTER WORKING WITH THE DATA AND DISCUSSING THE INFORMATION WITH YOUR GROUP, YOU SHOULD DESCRIBE 2 QUESTIONS THAT ARE CREATIVE AND INNOVATIVE. YOU SHOULD EXPLAIN WHY THESE QUESTIONS ARE INTERESTING AND WHY THEY DESERVE FURTHER INVESTIGATION. I ADVISE YOU TO THINK OF REASONS WHY AN OWNER OF THE DATA MIGHT BENEFIT FROM ANSWERS TO THESE QUESTIONS. THINK OF REASONS WHY THE WORLD MAY BE INTERESTED IN THESE QUESTIONS. THE PURPOSE OF THE INTRODUCTION IS TO STATE SOME INTERESTING QUESTIONS AND DEFEND THE VALUE OF THESE QUESTIONS. THIS INTRODUCTION SHOULD BE WRITTEN IN A WAY THAT SHOULD GET THE READER EXCITED ABOUT SEEING YOUR RESULTS. THIS SHOULD BE WRITTEN IN NO MORE THAN 4 PARAGRAPHS.

DATA

# Expand every (Name, Role) pair seen in the data across all Patch values.
# Pairs that have no row for a patch come back filled with NA, so keeping the
# rows that contain an NA and counting them per pair gives the champions/roles
# that are not present in every patch.
tempchamps <- leaguedf %>%
  complete(nesting(Name, Role), Patch) %>%
  filter(if_any(everything(), is.na)) %>%
  count(Name, Role)

# Per (Name, Role) summary for champions that appear in tempchamps, using only
# fully observed rows. One row per pair: number of rows, mean/sd of win rate,
# mean/sd of pick+ban rate, and a two-line plot label.
tempdf <- leaguedf %>%
  filter(Name %in% tempchamps$Name & complete.cases(.)) %>%
  group_by(Name, Role) %>%
  summarize(
    n = n(),
    meanWinRate = mean(WinRate),
    sdWinRate = sd(WinRate),
    meanPickBan = mean(PickBanRate),
    sdPickBan = sd(PickBanRate),
    .groups = "drop"
  ) %>%
  # Build the label after summarising: paste() is vectorised, so inside
  # summarize() it produced one row per input row instead of one per group.
  mutate(label = paste(Name, '\n', Role, sep = " ")) %>%
  # The name-only filter above also lets through roles with 23 rows; drop them.
  filter(n != 23)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Mean win rate against mean pick/ban rate (both rescaled to percent) for the
# temporary champions. Point size is 1/n, so pairs with fewer rows are drawn
# larger; the red line marks a 50% win rate.
tempdf %>%
  mutate(meanWinRate = meanWinRate * 100, meanPickBan = meanPickBan * 100) %>%
  ggplot(aes(x = meanWinRate, y = meanPickBan)) +
  geom_point(mapping = aes(size = 1 / n)) +
  # Consider removing? Is it too much stuff?
  geom_text(
    aes(label = label, color = Role),
    size = 3, nudge_x = 0.5, check_overlap = TRUE
  ) +
  geom_vline(xintercept = 50, color = 'red') +
  coord_trans(x = 'log10', y = 'log10') +
  labs(
    x = "Win Rate Average",
    y = "Pick Ban Rate Average",
    title = "Win Rate vs Pick Ban Rate for Temporary champions",
    subtitle = "Size is inversely proportional to number of patches present"
  ) +
  theme_minimal()

# One animated scatter per Role (frame = Patch) of pick/ban rate against win
# rate, coloured by Class, with marker size RoleRate * 5 and the champion name
# as hover text. The per-role plots are combined into a two-row subplot grid.
leaguedf %>%
  select("Name", "PickBanRate", "WinRate", "Role", "RoleRate", "Class", "Patch") %>%
  filter(!(Class == "NULL")) %>%
  group_by(Role) %>%
  # .keep = TRUE leaves the grouping column (Role) in each group's data.
  group_map(
    ~ plot_ly(
      data = .,
      x = ~ PickBanRate,
      y = ~ WinRate,
      color = ~ Class,
      text = ~ Name,
      frame = ~ Patch,
      hoverinfo = "text",
      type = "scatter",
      mode = "markers",
      marker = list(size = ~ RoleRate * 5)
    ),
    .keep = TRUE
  ) %>%
  subplot(nrows = 2, shareX = TRUE, shareY = TRUE, margin = 0.03) %>%
  layout(
    showlegend = FALSE,
    title = 'Pick Ban Rate vs. Win Rate by Patch separated by Role',
    plot_bgcolor = '#e5ecf6',
    # Grid colour written with a leading '#', like the zero-line colour.
    xaxis = list(
      zerolinecolor = '#ffff',
      zerolinewidth = 2,
      gridcolor = '#ffff'
    ),
    yaxis = list(
      zerolinecolor = '#ffff',
      zerolinewidth = 2,
      gridcolor = '#ffff'
    ),
    # NOTE(review): layout margins are normally a list (l, r, t, b); confirm
    # that a bare number has the intended effect here.
    margin = 0.07
  ) %>%
  # NOTE(review): `annotations` is not defined in this part of the file; it
  # presumably holds the per-role subplot titles. Confirm it exists upstream.
  layout(annotations = annotations)
# Correlation table from MakeCorrelationDf() (defined elsewhere), sorted by the
# PbrCorrelation column. Within each Champion2 group the two smallest and two
# largest rows are labelled with Champion1, with the first "." replaced by a
# space and the result title-cased. Every other row gets NA, except the one
# hand-picked outlier below.
PbrCorrelation <- MakeCorrelationDf("PickBanRate", "Pbr") %>%
  group_by(Champion2) %>%
  arrange(PbrCorrelation) %>%
  mutate(
    label = case_when(
      row_number() <= 2 | row_number() > n() - 2 ~
        str_to_title(str_replace(Champion1, '\\.', ' ')),
      # This is an outlier, so labelling it is justified; it also helps show
      # this part of the plot.
      Champion2 == "Tahm Kench.Support" & PbrCorrelation > 0.68 ~ "Senna Support",
      TRUE ~ NA_character_
    )
  )

# Boxplot of the pick/ban-rate correlation coefficients for three champions,
# with the labelled rows annotated next to their points.
# NOTE(review): this filter matches Champion2 with spaces ("Tahm Kench Support")
# while the labelling step compares against "Tahm Kench.Support" (with a dot).
# Confirm which form Champion2 actually uses; one of the two cannot match.
PbrCorrelation %>%
  filter(Champion2 %in% c("Tahm Kench Support", "Senna Support", "Ashe Adc")) %>%
  ggplot(mapping = aes(x = Champion2, y = PbrCorrelation)) +
  geom_boxplot() +
  ggtitle("PBR Correlation Boxplot") +
  scale_x_discrete(labels = c("Ashe Adc", "Senna Support", "Tahm Kench Support")) +
  labs(
    x = "",
    y = "Pick Ban Rate Correlation Coefficient",
    caption = "Minimum and Maximum correlation coefficients are annotated, as well as Senna Support for Tahm Kench Support in order\n to best visualize how the strength of certain counters, replacements, and synergies affect Pick Ban Rate."
  ) +
  # Rows without a label (NA) are dropped silently by na.rm = TRUE.
  geom_text(
    aes(label = label),
    na.rm = TRUE, hjust = -0.1, size = 3, check_overlap = TRUE
  )

# Per-champion mean and standard deviation of pick rate and win rate, then a
# scatter of mean pick rate against the win-rate standard deviation.
leaguedf %>%
  group_by(Name) %>%
  summarise(
    Mean_pick = mean(PickRate, na.rm = TRUE),
    Std_pick = sd(PickRate, na.rm = TRUE),
    Mean_win = mean(WinRate, na.rm = TRUE),
    Std_win = sd(WinRate, na.rm = TRUE)
  ) %>%
  arrange(desc(Mean_pick)) %>%
  ggplot(mapping = aes(x = Mean_pick, y = Std_win)) +
  geom_point() +
  # The line is a straight-line (lm) fit on the raw values. It only looks
  # curved because both axes are drawn on a log10 scale by coord_trans().
  geom_smooth(method = 'lm', se = FALSE) +
  coord_trans(x = 'log10', y = 'log10') +
  labs(x = "Mean Pick Rate", y = "Standard Deviation Win Rate")
## `geom_smooth()` using formula = 'y ~ x'

# Rows for the five staple top-lane champions; shared by all three panels so
# the filter is written once.
staple_top_rows <- leaguedf %>%
  filter(Name %in% c("Fiora", "Darius", "Garen", "Aatrox", "Jax"), Role == "Top")

# Build one Patch-by-champion panel whose point size follows `metric`.
#   data   data frame with Patch, Name and the metric column
#   metric unquoted column name mapped to point size
#   title  panel title
# Returns a ggplot object.
plot_staple_metric <- function(data, metric, title) {
  ggplot(data) +
    geom_count(
      aes(x = as.factor(Patch), y = Name, size = {{ metric }}, color = Name)
    ) +
    labs(x = "Patch", y = "Name", title = title)
}

plot1 <- plot_staple_metric(staple_top_rows, PickRate, "Pick Rate")

plot2 <- plot_staple_metric(staple_top_rows, BanRate, "Ban Rate")

plot3 <- plot_staple_metric(staple_top_rows, WinRate, "Win Rate")

# Stack the three panels vertically under one overall title.
(plot1 / plot2 / plot3) + plot_annotation(title = "Analysis of Staple Top Champions")

IN LESS THAN 6 PARAGRAPHS, YOU SHOULD DESCRIBE THE DATA USED TO ANSWER THE QUESTIONS. YOU SHOULD EXPLAIN WHERE THE DATA ORIGINATED. FOR EXAMPLE, IT IS GOOD TO KNOW WHO COLLECTED THE DATA. JUST BECAUSE THE DATA CAME FROM KAGGLE, DOESN’T MEAN KAGGLE.COM COLLECTED THE DATA. GIVE AN IN-DEPTH DESCRIPTION OF THE SPECIFIC VARIABLES IN THE DATA REQUIRED TO ANSWER YOUR QUESTIONS. YOU SHOULDN’T DISCUSS ALL VARIABLES IN THE DATA IF YOU DIDN’T USE ALL VARIABLES IN THE DATA. YOU SHOULD EXPLAIN WHAT EACH OBSERVATION REPRESENTS (I.E. PEOPLE, SCHOOLS, STATES, CITIES, PATIENTS FROM A SPECIFIC HOSPITAL). WHAT IS THIS A SAMPLE OF? HOW MANY OBSERVATIONS DO YOU HAVE? AFTER READING THIS SECTION, THE READER SHOULD CLEARLY UNDERSTAND THE SOURCE AND CONTENT OF THE DATA YOU PLAN ON UTILIZING TO ANSWER THE QUESTIONS THAT YOU PROPOSED IN THE INTRODUCTION. AT LEAST ONE DESCRIPTIVE TABLE AND AT LEAST ONE FIGURE SHOULD BE USED HERE TO HELP THE READER UNDERSTAND WHAT THE DATA LOOKS LIKE WITHOUT SEEING THE ENTIRE DATASET. IN ALL FIGURES AND TABLES, ONLY THE VARIABLES OF INTEREST SHOULD BE USED.

RESULTS

# Cluster Analysis with K-Means
#
# Step 1: Normalize the data.
# Drop the columns that are not used for clustering, then dummy-encode Role,
# Class and Name: each pivot_wider() call turns one of them into a set of 0/1
# indicator columns (1 where the row has that value, 0 elsewhere).
# The result is a high-dimensional data set.
Normaldf <- leaguedf %>%
  select(!c(Tier, Score, Trend, PickRate, BanRate)) %>%
  pivot_wider(
    names_from = Role,
    values_from = Role,
    values_fn = function(x) 1,
    values_fill = 0
  ) %>%
  # Prefix the class names so the new indicator columns are recognisable.
  mutate(Class = paste0("Class: ", Class)) %>%
  pivot_wider(
    names_from = Class,
    values_from = Class,
    values_fn = function(x) 1,
    values_fill = 0
  ) %>%
  pivot_wider(
    names_from = Name,
    values_from = Name,
    values_fn = function(x) 1,
    values_fill = 0
  ) %>%
  # Z-score the continuous columns: subtract the mean, divide by the sd.
  mutate(
    across(
      c(WinRate, RoleRate, PickBanRate, KDA, Patch),
      function(v) (v - mean(v)) / sd(v)
    )
  )
  
# Step 2: Clusterize the Data

# k-means starts from randomly chosen centres (25 restarts here). Fixing the
# seed makes the resulting assignment repeatable from run to run.
set.seed(2023)

# NOTE(review): the name `data` masks utils::data(); it is kept so that any
# other code referring to it still works.
data <- kmeans(Normaldf, centers = 6, nstart = 25)

leaguedf$Cluster <- as.factor(data$cluster)

# Reproducibility for graphing purposes: k-means numbers its clusters
# arbitrarily, so rank them by mean RoleRate (lowest first) and record the
# rank in `transformation`.
ordering <- leaguedf %>%
  group_by(Cluster) %>%
  summarize(RoleRate = mean(RoleRate)) %>%
  arrange(RoleRate) %>%
  mutate(transformation = row_number())

# Map raw cluster label(s) to their rank recorded in the global `ordering`.
#
#   x  one or more cluster labels (factor, character or number) that occur in
#      ordering$Cluster.
#
# Returns the matching entries of the third column of `ordering` (its
# `transformation` rank), one per element of `x`.
# Stops with an explicit message when a label is not present in `ordering`.
#
# NOTE(review): this name masks base::transform(); kept because callers in
# this file use it.
transform <- function(x) {
  # Compare as text so factors with different level sets can still be matched.
  positions <- match(as.character(x), as.character(ordering$Cluster))
  if (anyNA(positions)) {
    unknown <- unique(as.character(x)[is.na(positions)])
    stop(
      "Unknown cluster label(s): ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  ordering[[3]][positions]
}

# Replace each row's raw k-means label with its rank from transform() and
# store the result as a factor.
leaguedf <- leaguedf %>%
  mutate(Cluster = as.factor(vapply(Cluster, transform, integer(1))))

# K-means panels: three views of leaguedf coloured by the re-ranked k-means
# cluster. KMeansPalette (defined outside this section) supplies the colours.

# KDA against win rate; legend hidden.
plot1a <- ggplot(leaguedf, aes(x = KDA, y = WinRate, color = Cluster)) +
  geom_point(size = 0.75, alpha = 0.4) +
  scale_color_manual(values = KMeansPalette) +
  labs(x = "KDA", y = "Win Rate") +
  theme_minimal() +
  theme(legend.position = "none")

# Pick/ban rate against win rate; this panel carries the legend, drawn with
# larger keys than the plotted points.
plot1b <- ggplot(leaguedf, aes(x = PickBanRate, y = WinRate, color = Cluster)) +
  geom_point(size = 0.75, alpha = 0.4) +
  scale_color_manual(values = KMeansPalette) +
  labs(x = "Pick/Ban Rate", y = "") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(override.aes = list(size = 3)))

# RoleRate distribution per role, one box per cluster; legend hidden.
plot1c <- ggplot(leaguedf, aes(x = Role, y = RoleRate, color = Cluster)) +
  geom_boxplot(lwd = 0.5) +
  scale_color_manual(values = KMeansPalette) +
  labs(x = "Role", y = "Role %") +
  theme_minimal() +
  theme(legend.position = "none")

# Layout: panels 1 and 2 side by side (four rows tall), panel 3 across the
# full width below them (four rows), and area 4 in the last row for the
# collected legend.
design <- "\n12\n12\n12\n12\n33\n33\n33\n33\n44"

KMeans <- wrap_elements(
  plot1a + plot1b + plot1c + guide_area() +
    plot_layout(design = design, guides = "collect") &
    plot_annotation(title = "K Means")
)


# Tier panels: the same three views as the k-means figure, coloured by Tier.
# TierPalette (defined outside this section) supplies the colours.

# KDA against win rate; legend hidden.
plot1a <- ggplot(leaguedf, aes(x = KDA, y = WinRate, color = Tier)) +
  geom_point(size = 0.75, alpha = 0.4) +
  scale_color_manual(values = TierPalette) +
  labs(x = "KDA", y = "Win Rate") +
  theme_minimal() +
  theme(legend.position = "none")

# Pick/ban rate against win rate; carries the legend with enlarged keys.
plot1b <- ggplot(leaguedf, aes(x = PickBanRate, y = WinRate, color = Tier)) +
  geom_point(size = 0.75, alpha = 0.4) +
  scale_color_manual(values = TierPalette) +
  labs(x = "Pick/Ban Rate", y = "") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(override.aes = list(size = 3)))

# RoleRate distribution per role, one box per tier; legend hidden.
plot1c <- ggplot(leaguedf, aes(x = Role, y = RoleRate, color = Tier)) +
  geom_boxplot(lwd = 0.5) +
  scale_color_manual(values = TierPalette) +
  labs(x = "Role", y = "Role %") +
  theme_minimal() +
  theme(legend.position = "none")

# Same panel layout (`design`) as the k-means figure.
Meta_Tiers <- wrap_elements(
  plot1a + plot1b + plot1c + guide_area() +
    plot_layout(design = design, guides = "collect") &
    plot_annotation(title = "Meta SRC Tier")
)

# Put the two wrapped figures side by side under one centred, bold title.
(KMeans | Meta_Tiers) &
  plot_annotation(title = "Cluster Analysis") &
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))

# Hierarchical Clustering
# Agglomerative clustering of the rows of Normaldf. The distance metric and
# linkage are the function defaults, spelled out here.
HCluster <- hclust(dist(Normaldf, method = "euclidean"), method = "complete")

# Dendrogram of the whole tree, with the x-axis label and subtitle blanked.
plot(HCluster, xlab = '', sub = '', cex = 0.9)

# Cut the tree into five groups and attach each row's group as a factor.
leaguedf[["HClust"]] <- as.factor(cutree(HCluster, k = 5))

# Hierarchical-cluster panels: three views of leaguedf coloured by HClust,
# using the default colour scale.

# KDA against win rate; legend hidden.
plot1a <- ggplot(leaguedf, aes(x = KDA, y = WinRate, color = HClust)) +
  geom_point(size = 0.6, alpha = 0.8) +
  labs(x = "KDA", y = "Win Rate") +
  theme_minimal() +
  theme(legend.position = "none")

# Pick/ban rate against win rate; carries the legend with enlarged keys.
plot1b <- ggplot(leaguedf, aes(x = PickBanRate, y = WinRate, color = HClust)) +
  geom_point(size = 0.6, alpha = 0.8) +
  labs(x = "Pick/Ban Rate", y = "Win Rate") +
  theme_minimal() +
  theme(legend.position = "right") +
  guides(color = guide_legend(override.aes = list(size = 3)))

# RoleRate distribution per role, one box per cluster; legend hidden.
plot1c <- ggplot(leaguedf, aes(x = Role, y = RoleRate, color = HClust)) +
  geom_boxplot(lwd = 0.5) +
  labs(x = "Role", y = "Role %") +
  theme_minimal() +
  theme(legend.position = "none")

# Two scatter panels on top, boxplot below, with collected legends and a
# centred, bold title.
((plot1a | plot1b) / plot1c) &
  plot_layout(guides = "collect") &
  plot_annotation(title = "Hierarchical Cluster Analysis") &
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = 'bold'))

# One row per hierarchical cluster: mean win rate, mean pick/ban rate, mean
# role share, mean KDA, median patch and the number of distinct champions,
# rendered as an HTML table.
leaguedf %>%
  group_by(`Hierarchical Cluster` = HClust) %>%
  summarize(
    `Mean Win Rate` = mean(WinRate),
    `Mean PB Rate` = mean(PickBanRate),
    `Mean Role %` = mean(RoleRate),
    `Mean KDA` = mean(KDA),
    `Median Patch` = median(Patch),
    `Number of Champs` = n_distinct(Name)
  ) %>%
  kbl() %>%
  kable_classic(full_width = FALSE, html_font = "Times New Roman")
Hierarchical Cluster Mean Win Rate Mean PB Rate Mean Role % Mean KDA Median Patch Number of Champs
1 0.4963903 0.0988015 0.5624936 2.089316 11.0 130
2 0.5124245 0.0760563 0.6885434 2.536914 15.0 140
3 0.4449148 0.1436213 0.2178475 2.146066 8.0 12
4 0.5089792 0.4172494 0.8719532 2.705065 11.0 13
5 0.4639125 0.1569875 0.9865125 4.143750 16.5 1
# For every Tier, the share of its rows that falls into each k-means cluster,
# sorted from the largest share down. The result stays grouped by Tier.
Propdf <- leaguedf %>%
  count(Tier, Cluster, name = "count") %>%
  group_by(Tier) %>%
  mutate(Proportion = count / sum(count)) %>%
  arrange(desc(Proportion))

# Tile plot of those shares: Tier on the x axis, Cluster on the y axis.
heatplot <- ggplot(Propdf) +
  geom_tile(mapping = aes(x = Tier, y = Cluster, fill = Proportion)) +
  theme_minimal()

heatplot & plot_annotation(title = "Similarity Heatmap", subtitle="This should feel close to a confusion matrix")

Part 2: Classification Exploration

# Feature Selection
# Boruta run with Tier as the response and every remaining column as a
# candidate predictor. Score, Trend and the two cluster labels are excluded.
boruta <- Boruta(
  Tier ~ .,
  data = leaguedf %>% select(!c(Score, Trend, Cluster, HClust))
)

# Importance plot with vertical axis labels (las = 2) in a smaller font.
plot(boruta, las = 2, cex.axis = 0.7)

From the Boruta plot, we can see that WinRate is likely the most important variable. Importantly, it will be better to train our model on PickRate and BanRate as opposed to PickBanRate. While PickBanRate is still an important variable, we choose not to include it because it overrepresents the value of picks and bans in a champion's tier: they could potentially be counted twice, whereas WinRate could only be counted once. Additionally, we see that Patch is the least important variable, but it is not unimportant for prediction, which is interesting and suggests that there may be differences between the Tier distributions of different patches at some level.

# Data Splitting

# Modelling frame: drop the derived and label-like columns, then standardise
# every numeric column. c() strips the matrix shape and attributes that
# scale() returns, leaving a plain numeric vector.
# NOTE(review): the scaling uses all rows, i.e. it happens before the
# train/test split, so test rows influence the centring and scaling of the
# training data. Confirm this is acceptable.
ModelFrame <- leaguedf %>%
  select(-c(PickBanRate, HClust, Cluster, Score, Trend)) %>%
  mutate(across(where(is.numeric), ~ c(scale(.x))))

# The partition is drawn at random; fix the seed so the split (and everything
# fitted or evaluated on it) can be reproduced.
set.seed(2023)
inTrain <- createDataPartition(y = ModelFrame$Tier, p = 0.75, list = FALSE)

leagueTrain <- ModelFrame[inTrain, ]
leagueTest <- ModelFrame[-inTrain, ]
# Model Fitting
# Random forest (ranger) tuned over every combination of mtry, split rule and
# minimum node size below.
grid <- expand.grid(
  mtry = c(6, 9, 12),
  splitrule = c("extratrees", "gini"),
  min.node.size = c(1, 3, 6, 10)
)

# 5-fold cross-validation with class probabilities. "cv" is the spelling
# documented for trainControl().
fitControl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  verboseIter = FALSE
)

rf_fit <- train(
  Tier ~ .,
  data = leagueTrain,
  method = "ranger",
  tuneGrid = grid,
  trControl = fitControl,
  importance = "impurity"
)

# Persist the fitted model.
saveRDS(rf_fit, "RF_Fit.rds")
# Linear SVM tuned over the cost parameter C.
# The grid keeps the same 20 evenly spaced points between 0 and 10 but drops
# the first one: with C = 0 the dual box constraint 0 <= alpha <= C forces
# every multiplier to zero, so no classifier can be fitted at that point.
grid <- expand.grid(C = seq(0, 10, length.out = 20)[-1])

# 5-fold cross-validation ("cv" is the spelling documented for trainControl()).
fitControl <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE
)

svmlin_fit <- train(
  Tier ~ .,
  data = leagueTrain,
  method = "svmLinear",
  trControl = fitControl,
  tuneGrid = grid
)

# Persist the fitted model.
saveRDS(svmlin_fit, "SVMLin_Fit.rds")
# Radial-kernel SVM tuned over a 5 x 5 grid of cost (C) and kernel width
# (sigma).
grid <- expand.grid(
  C = seq(4, 5, length.out = 5),
  sigma = seq(0.20, 0.3, length.out = 5)
)

# 3-fold cross-validation ("cv" is the spelling documented for trainControl()).
fitControl <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = FALSE
)

svmrad_fit <- train(
  Tier ~ .,
  data = leagueTrain,
  method = "svmRadial",
  trControl = fitControl,
  tuneGrid = grid
)

# Persist the fitted model.
saveRDS(svmrad_fit, "SVMRad_Fit.rds")
# Most frequent value of `x`.
# Ties go to the value that appears first in `x`. For an empty input the
# result is NA of the input's type.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of each element within the distinct values, then how often each
  # position occurs.
  positions <- match(x, distinct_values)
  frequencies <- tabulate(positions)
  distinct_values[which.max(frequencies)]
}

# Grid of predictor values used to draw the model's decision regions: predict
# a tier for every grid row, then keep the most common prediction per
# (WinRate, PickRate) cell and write the result to disk.
# NOTE(review): `fit` is not defined in this part of the file (presumably one
# of the saved models is loaded into it elsewhere). The grid only contains
# WinRate and PickRate, so predict() will need the model's other predictors
# as well, and the model was trained on scaled columns while this grid is on
# the raw scale. Confirm before relying on the output.
Boundarydf <- expand.grid(
  # `length.out` spelled in full: `out = 30` matched no seq() argument, so the
  # sequence collapsed to the single value 0.4.
  WinRate = seq(0.4, 0.6, length.out = 30),
  # NOTE(review): seq(0.0, 0) is just 0; the PickRate range looks unfinished.
  PickRate = seq(0.0, 0)
)
Boundarydf$tier <- predict(fit, Boundarydf)
Boundarydf <- Boundarydf %>%
  # The grid columns are WinRate and PickRate; they are renamed while grouping
  # so the written file has the winr / pbr column names.
  group_by(winr = WinRate, pbr = PickRate) %>%
  summarize(tier = Mode(tier), .groups = "drop")

write.csv(Boundarydf, "BoundaryFrame.csv")
# Visualization #1 for Ranger Model
# Heatmap of cross-validated accuracy for every (mtry, min.node.size) pair,
# one facet per split rule. Both tuning parameters become ordered factors so
# the tiles are evenly spaced and sorted numerically.
acc_plot <- fit$results %>%
  mutate(
    mtry = factor(
      as.character(mtry),
      levels = c("6", "9", "12", "15"), ordered = TRUE
    ),
    min.node.size = factor(
      as.character(min.node.size),
      levels = c("1", "3", "6", "10"), ordered = TRUE
    )
  ) %>%
  ggplot(mapping = aes(x = mtry, y = min.node.size, fill = Accuracy)) +
  geom_tile() +
  facet_wrap(splitrule ~ .) +
  ggtitle("Accuracy for each value") +
  theme(legend.position = "bottom")

# Confusion matrix of the model's predictions on the held-out test set.
rf_pred <- predict(fit, leagueTest)

cm <- confusionMatrix(rf_pred, leagueTest$Tier, dnn = c("Prediction", "Actual"))

# Long-format counts plus, for each actual tier, the share of its rows that
# received each prediction.
plt <- as.data.frame(cm$table) %>%
  group_by(Actual) %>%
  mutate(Percent = Freq / sum(Freq)) %>%
  ungroup()

# Reverse the prediction levels so the first tier is drawn at the top of the
# y axis and the diagonal runs from top-left to bottom-right.
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))

# Each axis shows its own variable and takes its tick labels from the factor
# levels. Hard-coded tick labels combined with swapped axis titles put the
# wrong tier names on the off-diagonal cells.
cm_plot <- ggplot(plt, aes(x = Actual, y = Prediction, fill = Percent)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Actual", y = "Prediction", title = "Confusion Matrix") +
  theme_minimal() +
  theme(legend.position = "bottom")

# KDA against win rate for top-lane Fighters and Tanks in patch 23, coloured
# by tier.
# TODO: the decision-region raster layer was never finished. The dangling
# `+ geom_raster` (a bare function, not a layer) made this plot fail, so it is
# removed until a boundary data set and aesthetics are available.
boundary_plot <- leaguedf %>%
  filter(Role == 'Top' & Patch == 23 & (Class == "Fighter" | Class == "Tank")) %>%
  ggplot() +
  geom_point(mapping = aes(x = KDA, y = WinRate, color = Tier))

# Accuracy heatmap and confusion matrix side by side, boundary plot below.
(acc_plot + cm_plot) / boundary_plot
# Cross validation is expected.
# This currently uses a train-test split, but should we change to something
# else? LOOCV? Or divide and conquer?

# Assign every test row a random integer bucket from 0 to 9.
leagueTest[["sample"]] <- floor(runif(n = nrow(leagueTest), min = 0, max = 10))

# Test-set accuracy of each model, the number of test rows, and the standard
# deviation of the per-row correct/incorrect indicator.
# The Pred_* columns are expected to already exist on leagueTest.
leagueTest %>%
  summarize(
    Linear_Accuracy = mean(Pred_svmlin == Tier),
    Radial_Accuracy = mean(Pred_svmrad == Tier),
    RF_Accuracy = mean(Pred_rf == Tier),
    n = n(),
    Linear_sd = sd(Pred_svmlin == Tier),
    Radial_sd = sd(Pred_svmrad == Tier),
    RF_sd = sd(Pred_rf == Tier)
  )
##   Linear_Accuracy Radial_Accuracy RF_Accuracy    n Linear_sd Radial_sd
## 1       0.8295455       0.8536932   0.8536932 1408 0.3761652 0.3535391
##       RF_sd
## 1 0.3535391
# Special analysis just for the temporary champions: linear-SVM accuracy on
# the test rows, split by whether the (Name, Role) pair is present in all 23
# patches.
# TODO: Fix table format so it isn't disgusting
leagueTest %>%
  # Count patches on the FULL data set. Counting inside the test split (about
  # a quarter of the rows) puts every pair below 23, so every row was being
  # flagged as temporary.
  left_join(
    count(leaguedf, Name, Role, name = "n_patches"),
    by = c("Name", "Role")
  ) %>%
  mutate(Temp = n_patches < 23, correct = Tier == Pred_svmlin) %>%
  group_by(Temp) %>%
  summarize(Acc = mean(correct), n = n(), sd = sd(correct)) %>%
  kbl() %>%
  kable_classic()
Temp Acc n sd
TRUE 0.8295455 1408 0.3761652
# Assume that the true probability is correct. Then we have H0 as
# Temp >= NonTemp, and H1 as Temp < NonTemp.
# One-sided two-sample z-test on the accuracies entered below:
#   diff = accuracy gap (non-temporary minus temporary)
#   z    = gap divided by its standard error
#   p    = upper-tail normal probability of z
table <- tibble(
  NonTemporary = 0.827,
  Temporary = 0.766,
  NonTemporaryN = 4876,
  TemporaryN = 761,
  sdTemporary = 0.423,
  sdNonTemporary = 0.378
) %>%
  mutate(
    diff = NonTemporary - Temporary,
    z = diff / sqrt(sdTemporary^2 / TemporaryN + sdNonTemporary^2 / NonTemporaryN),
    p = pnorm(q = z, lower.tail = FALSE)
  ) %>%
  kbl() %>%
  kable_classic()


table
NonTemporary Temporary NonTemporaryN TemporaryN sdTemporary sdNonTemporary diff z p
0.827 0.766 4876 761 0.423 0.378 0.061 3.751258 8.8e-05

CONCLUSION

IN LESS THAN 4 PARAGRAPHS, YOU SHOULD RESTATE YOUR QUESTIONS ALONG WITH YOUR CONCLUSIONS. THE PURPOSE OF THIS SECTION IS TO SUMMARIZE YOUR FINDINGS (SHORT), DEFEND THE IMPORTANCE OF YOUR RESULTS IN THE REAL WORLD (LONG), AND PROVIDE A ROADMAP FOR OTHERS TO CONTINUE THIS WORK (LONG). ARE YOUR CONCLUSIONS WHAT YOU EXPECTED OR UNUSUAL? WHY SHOULD SOMEONE CARE ABOUT THESE RESULTS? HOW COULD THESE RESULTS BE USED IN THE REAL WORLD? YOU SHOULD PROVIDE IDEAS ABOUT FUTURE DIRECTIONS ON WHERE YOUR MODELING COULD POSSIBLY BE IMPROVED. ARE THERE ANY METHODS YOU DIDN’T USE THAT MAY WORK BETTER? IS THERE DATA YOU DIDN’T HAVE ACCESS TO THAT MAY BE USEFUL IN THIS DATA ANALYSIS?